# Author: Mark Richardson
# Purpose: Estimate skills ratings

# Open a log
library(logr)

lf <- log_open(file_name = "02_skills_ratings_estimation.log")

log_code()

# Load packages
library(coda)
library(dplyr)
library(forcats)
library(ggplot2)
library(readr)
library(runjags)
library(stringr)


# Load data
load("data/ratings/skills_ratings/skills_ratings_for_model.RData")

#### Wrangle the ratings ####

# Calculate number of ratings per rater
skills_ratings <- skills_ratings %>%
  rowwise() %>%
  mutate(n_ratings = sum(!is.na(c_across(skills_rating_1:skills_rating_5)))) %>%
  ungroup()

table(skills_ratings$n_ratings)

# Drop respondents with fewer than 2 ratings

skills_ratings <- skills_ratings %>% filter(n_ratings >= 2)

# Create a rater number

skills_ratings <- skills_ratings %>%
  mutate(num = 1:n(),
         rater = paste0("rater_", num))

# Remove extra space due to encoding issue between Java and Qualtircs - Â

skills_ratings <- skills_ratings %>%
  mutate(agency_rated_1 = str_squish(agency_rated_1),
         agency_rated_2 = str_squish(agency_rated_2),
         agency_rated_3 = str_squish(agency_rated_3),
         agency_rated_4 = str_squish(agency_rated_4),
         agency_rated_5 = str_squish(agency_rated_5))

#### Create the ratings matrix - rows are agencies, columns are raters ####

# Get agency list to identify rows for the matrix

ags <- na.omit(unique(c(skills_ratings$agency_rated_1,
                        skills_ratings$agency_rated_2,
                        skills_ratings$agency_rated_3,
                        skills_ratings$agency_rated_4,
                        skills_ratings$agency_rated_5)))

skills_matrix <- tibble(agency = ags)

rater_data <- skills_ratings %>% select(dept, office, rater) # rater index and department

rater_vect <- skills_ratings$rater

# Note: Look at duplicate issue more carefully - distinct() should keep first rating given order

for (i in 1:nrow(skills_ratings)) {
  
  # Get vector of ratings
  ratings_v <- skills_ratings %>%
    filter(rater == rater_vect[i]) %>%
    select(skills_rating_1:skills_rating_5) %>%
    unlist(., use.names = FALSE)
  
  # Get vector of rated agencies
  rated_v <- skills_ratings %>%
    filter(rater == rater_vect[i]) %>%
    select(agency_rated_1:agency_rated_5) %>%
    unlist(., use.names = FALSE)
  
  r <- tibble(agency = str_squish(rated_v),
              "{rater_vect[i]}" := ratings_v) %>%
    tidyr::drop_na(agency) %>%
    distinct(agency, .keep_all = TRUE)
    
  skills_matrix <- full_join(skills_matrix, r, by = "agency")

}

# Clean up
rm(ratings_v, rated_v, r)

#### Get ratings per agency ####

skills_matrix <- skills_matrix %>%
  rowwise() %>%
  mutate(n_ratings = sum(!is.na(c_across(!c(agency))))) %>%
  ungroup()

n <- skills_matrix %>%
  select(agency, n_ratings)

skills_matrix <- skills_matrix %>% filter(n_ratings >= 5) # Require at least 5 ratings per agency

# Check ratings per rater

ratings_per_rater <- skills_matrix %>%
  select(!c(agency, n_ratings)) %>%
  as.matrix() %>%
  is.na() %>%
  colSums(., na.rm = TRUE)

ratings_per_rater <- nrow(skills_matrix) - ratings_per_rater

ratings_per_rater_2_plus <- ratings_per_rater >= 2

table(ratings_per_rater_2_plus) # 17 with only one rating

raters_to_drop <- names(ratings_per_rater_2_plus)[!is.na(match(ratings_per_rater_2_plus, FALSE))]

skills_matrix <- skills_matrix %>% select(!all_of(raters_to_drop))

# Check ratings per agency again (2020)

skills_matrix <- skills_matrix %>%
  rowwise() %>%
  mutate(n_ratings = sum(!is.na(c_across(!c(agency, n_ratings))))) %>%
  ungroup()

table(skills_matrix$n_ratings < 5) # All agencies have 5 ratings

# Save number of ratings per agency

skills_n_2020 <- skills_matrix %>%
  select(agency, n_ratings) %>%
  mutate(year = 2020)

save(skills_n_2020, file = "data/ratings/skills_ratings/skills_n_ratings_2020.RData")

rm(skills_n_2020)

# Drop n_ratings
skills_matrix <- skills_matrix %>%
  select(!c(n_ratings)) # Note skills_matrix has 1375 columns - 1 agency column, 1373 raters, 1 n_agency column

# Clean up

rm(n, ratings_per_rater, ratings_per_rater_2_plus)

#### Prepare the data for JAGS ####

skills_matrix_for_jags <- skills_matrix %>%
  arrange(agency) %>%
  tibble::column_to_rownames(var = "agency") # Now data frame has 1373 columns for the 1373 raters with at least 2 ratings

agency_index <- tibble(agency = row.names(skills_matrix_for_jags),
                       index = 1:nrow(skills_matrix_for_jags))

agency_index <- left_join(agency_index, skills_inf_priors, by = c("agency" = "agency_rated"))

# For agencies with 5 or fewer ratings set precision equal to 0.04 which is sd = 5, var = 25;
# Set informed mean to 3 if no informed ratings

agency_index <- agency_index %>%
  mutate(skills_inf_se = sqrt(skills_inf_var / n),
         precision = 1/skills_inf_se,
         precision = if_else(n <= 5 | is.na(precision), 0.04, precision),
         skills_inf_mean = if_else(is.na(skills_inf_mean), 3, skills_inf_mean))

rater_index <- tibble(rater = names(skills_matrix_for_jags)) %>%
  left_join(., rater_data, by = "rater")

#### Load and format the 2014 ratings ####

#### Load the 2014 ratings ####

skills_names_2014 <- read_csv(file = "skills_ratings/data/ratings/skills_ratings/skills_ratings_2014/skills_names.csv")

skills_priors_2014 <- read_csv(file = "skills_ratings/data/ratings/skills_ratings/skills_ratings_2014/skills_prior.csv")

skills_ratings_2014 <- read_csv(file = "skills_ratings/data/ratings/skills_ratings/skills_ratings_2014/skills_ratings.csv",
                                col_types = cols(.default = col_double()))

names(skills_ratings_2014)

skills_ratings_2014 <- skills_ratings_2014 %>% select(!skills_n_ratings) # Drop the ratings counter

#### Format the ratings to match 2020 format ####

skills_t <- t(skills_ratings_2014) # transpose - raters are columns, rows are agencies (and col. names automatically mapped to row names)

skills_t <- as_tibble(skills_t, rownames = "agency_var")

skills_matrix_2014 <- full_join(skills_names_2014, skills_t, by =c("names" = "agency_var")) %>%
  rename(agency = items) %>%
  select(!names)

colnames(skills_matrix_2014) <- c("agency", paste0("rater_", 1:(ncol(skills_matrix_2014) - 1)))

rm(skills_t, skills_names_2014)

#### Format priors to match 2020 ####

skills_priors_2014 <- skills_priors_2014 %>%
  rename(agency = Agency,
         skills_inf_mean = mean,
         n = N,
         precision = pre) %>%
  mutate(agency = str_remove_all(agency, "\\s\\(All\\)")) %>%
  mutate(precision = if_else(n <= 5, 0.04, precision),
         skills_inf_mean = if_else(n == 0, 2.5, skills_inf_mean))

#### Get ratings per agency (2014) ####

skills_matrix_2014 <- skills_matrix_2014 %>%
  rowwise() %>%
  mutate(n_ratings = sum(!is.na(c_across(!c(agency))))) %>%
  ungroup()

n <- skills_matrix_2014 %>%
  select(agency, n_ratings)

skills_matrix_2014 <- skills_matrix_2014 %>% filter(n_ratings >= 5) # Require at least 5 ratings per agency

# Check ratings per rater (2014)

ratings_per_rater <- skills_matrix_2014 %>%
  select(!c(agency, n_ratings)) %>%
  as.matrix() %>%
  is.na() %>%
  colSums(., na.rm = TRUE)

ratings_per_rater <- nrow(skills_matrix_2014) - ratings_per_rater

ratings_per_rater_2_plus <- ratings_per_rater >= 2

table(ratings_per_rater_2_plus) # 115 with only one rating; 1,247 with 0 ratings (because started with all respondents)

raters_to_drop <- names(ratings_per_rater_2_plus)[!is.na(match(ratings_per_rater_2_plus, FALSE))]

skills_matrix_2014 <- skills_matrix_2014 %>% select(!all_of(raters_to_drop))

# Check ratings per agency again (2014)

skills_matrix_2014 <- skills_matrix_2014 %>%
  rowwise() %>%
  mutate(n_ratings = sum(!is.na(c_across(!c(agency, n_ratings))))) %>%
  ungroup()

table(skills_matrix_2014$n_ratings < 5) # All agencies have 5 ratings

# Save number of ratings per agency

skills_n_2014 <- skills_matrix_2014 %>%
  select(agency, n_ratings) %>%
  mutate(year = 2014)

save(skills_n_2014, file = "skills_ratings/data/ratings/skills_ratings/skills_n_ratings_2014.RData")

rm(skills_n_2014)

skills_matrix_2014 <- skills_matrix_2014 %>% select(!n_ratings) # Drop n_ratings variable

#### Prepare the data for JAGS (2014) ####

skills_matrix_for_jags_2014 <- skills_matrix_2014 %>%
  arrange(agency) %>%
  tibble::column_to_rownames(var = "agency") # Now data frame has 2190 columns for the 2190 raters with at least 2 ratings

agency_index_2014 <- tibble(agency = row.names(skills_matrix_for_jags_2014))

agency_index_2014 <- left_join(agency_index_2014, skills_priors_2014, by = "agency")

rater_index_2014 <- tibble(rater = names(skills_matrix_for_jags_2014))

#### Join 2020 and 2014 data for passing to JAGS ####

names(skills_matrix_for_jags_2014) <- paste0(names(skills_matrix_for_jags_2014), "_2014")

rownames(skills_matrix_for_jags_2014) <- paste0(rownames(skills_matrix_for_jags_2014), "_2014")

skills_matrix_for_jags_all <- bind_rows(skills_matrix_for_jags,
                                        skills_matrix_for_jags_2014)

nrow(skills_matrix_for_jags_all) == nrow(skills_matrix_for_jags) + nrow(skills_matrix_for_jags_2014)

ncol(skills_matrix_for_jags_all) == ncol(skills_matrix_for_jags) + ncol(skills_matrix_for_jags_2014)

# All ratings should be NA if rater was not in that survey

sum(!is.na((skills_matrix_for_jags_all %>%
        select(!contains("_2014")) %>%
        slice( (nrow(skills_matrix_for_jags) + 1):nrow(skills_matrix_for_jags_2014) )))) # All 2020 raters have NA for 2014 columns

sum(!is.na((skills_matrix_for_jags_all %>%
              select(contains("_2014")) %>%
              slice( 1:nrow(skills_matrix_for_jags) )))) # All 2014 raters have NA for 2020 columns

agency_index_all <- bind_rows(mutate(agency_index, year = 2020),
                              mutate(agency_index_2014, year = 2014))

agency_index_all <- agency_index_all %>%
  mutate(index = 1:n())

rater_index_all <- bind_rows(mutate(rater_index, year = 2020),
                             mutate(rater_index_2014, year = 2014))

rater_index_all <- rater_index_all %>%
  mutate(index = 1:n())

# Check names on agency index and ratings matrix

table(agency_index_all$agency == str_remove(rownames(skills_matrix_for_jags_all), "_2014")) # All names match
        
#### Assemble the data for JAGS ####

# Initialize vectors

y <- vector(mode = "numeric") # Ratings
agency <- vector(mode = "numeric") # Agency index
rater <- vector(mode = "numeric") # Rater index

# Populate vectors

for (i in 1:nrow(skills_matrix_for_jags_all)) {
  for (j in 1:ncol(skills_matrix_for_jags_all)) {
   
    if (!is.na(skills_matrix_for_jags_all[i, j])) {
      
      y <- c(y, skills_matrix_for_jags_all[i, j])
      
      agency <- c(agency, agency_index_all$index[i])
      
      rater <- c(rater, rater_index_all$index[j])
      
    }
  }
}

N <- length(y) # Number of ratings
A <- length(unique(agency)) # Number of agencies
R <- length(unique(rater)) # Number of raters
Ws <- diag(2) # Prior scale matrix for the Wishart prior - set it to the identity matrix

# Create a list object for JAGS

for_jags <- list(y = y, N = N,
                 agency = agency, A = A,
                 rater = rater, R = R,
                 u = pull(agency_index_all, skills_inf_mean),
                 pre = pull(agency_index_all, precision),
                 Ws = Ws)

for_jags_dump <- dump.format(for_jags)

#### Write the JAGS model ####

model <- "model {
  for(i in 1:N){ #loop over N ratings
    
    y[i] ~ dnorm(y.hat[i], tau.y[agency[i]])
    
    y.hat[i] <- alpha[rater[i]] + beta[rater[i]] * x[agency[i]]
    
  }
  
  # Prior latent agency ideology and agency variance
  for(a in 1:A){
    
    x[a] ~ dnorm(u[a], pre[a])
    
    sigma.y[a] ~ dunif(0, 100) # Standard deviation
    tau.y[a] <- pow(sigma.y[a], -2) # Convert to precision
    
  }
  
  for(r in 1:R){
  
    alpha[r] <- xi.a*B.raw[r, 1]
    beta[r]  <- xi.b*B.raw[r, 2]
    
    B.raw[r, 1:2] ~ dmnorm(B.raw.hat[r,], Tau.B.raw[,])
    B.raw.hat[r, 1] <- mu.a.raw
    B.raw.hat[r, 2] <- mu.b.raw
  
  }
  
  mu.a <- xi.a*mu.a.raw
  mu.b <- xi.b*mu.b.raw
  
  mu.a.raw ~ dnorm(0, 0.0001)
  mu.b.raw ~ dnorm(0, 0.0001)
  
  xi.a ~ dunif(0, 100)
  xi.b ~ dunif(0, 100)
  
  Tau.B.raw[1:2, 1:2] ~ dwish(Ws[,], df)
  df <- 3
  
  Sigma.B.raw[1:2, 1:2] <- inverse(Tau.B.raw[,])
  
  sigma.a <- xi.a*sqrt(Sigma.B.raw[1, 1])
  sigma.b <- xi.b*sqrt(Sigma.B.raw[2, 2])

  rho <- Sigma.B.raw[1, 2]/sqrt(Sigma.B.raw[1, 1]*Sigma.B.raw[2, 2])

}"

#### Estimate the model ####

# Set initial values - need to set random number generator seeds when using parallel chains

inits <- list(list(x = rep.int(1, times = A), .RNG.name = "base::Mersenne-Twister", .RNG.seed = 1),
              list(x = rep.int(5, times = A), .RNG.name = "base::Mersenne-Twister", .RNG.seed = 5))

# Execute the simulation

setwd("data/ratings/skills_ratings")

skills_posteriors <- run.jags(model = model, monitor = c("alpha", "beta", "x", "mu.a", "mu.b", "sigma.a", "sigma.b", "rho"),
                            data = for_jags_dump, inits = inits, n.chains = 2,
                            adapt = 1000, burnin = 10000,
                            sample = 4000, thin = 10,
                            method = "parallel",
                            keep.jags.files = TRUE)

save.image(file = "skills_ratings_image.RData")

#### Check convergence of ratings ####

sample <- as.mcmc.list(skills_posteriors)

gd <- gelman.diag(sample[,(2*R + 1):(2*R + A)])

quantile(gd[[1]][,1]) # Point estimate
# Median is 1.0006 and max is 1.01

quantile(gd[[1]][,2]) # Upper 95% C.I.
# Median is 1.001 and max is 1.04

#### Get ratings ####

xi <- as.matrix(sample[,(2*R + 1):(2*R + A)])

xi <- (xi - mean(xi)) / sd(xi) # Local identification

xi <- as.data.frame(t(xi))

xi_tidy <- as_tibble(xi) %>%
  mutate(agency = factor(agency_index_all$agency),
         year = agency_index_all$year) %>%
  select(agency, year, everything())

xi_tidy_all <- xi_tidy %>%
  rowwise() %>%
  mutate(skills_mean = mean(c_across(V1:V8000)),
         skills_sd = sd(c_across(V1:V8000)),
         skills_lb = quantile(c_across(V1:V8000), p = 0.025),
         skills_ub = quantile(c_across(V1:V8000), p = 0.975)) %>%
  select(!c(V1:V8000))

xi_tidy_2014 <- xi_tidy_all %>%
  filter(year == 2014)

log_print(xi_tidy_2014 %>% slice(1:10))

readr::write_csv(xi_tidy_2014,
                 file = "skills_2014.csv")

log_close()
